# ------------------------------------------------------------------------------
# NOTE:
# This script generates year-by-year document–feature matrices from the Diet
# speech data. However, for the reasons explained in the README, this script
# does not run using only the data distributed in the published replication files.
# ------------------------------------------------------------------------------

require(stringr)
require(dplyr)
require(tidyr)
require(RMeCab)
require(quanteda)
require(stringi)

# function: tokenize Japanese text with MeCab and join tokens with spaces
#
# Args:
#   txts: character vector of texts (may be named).
# Returns:
#   Character vector with the same length and names as `txts`. Each non-empty
#   text is replaced by its MeCab tokens (RMeCab::RMeCabC) joined with single
#   spaces; empty strings stay empty and NA stays NA.
# Side effects:
#   Prints the running index to the console after every 100th text.
char.segment <- function(txts) {
  # preallocate with "" so empty inputs need no explicit branch
  txts_seg <- character(length(txts))
  for (i in seq_along(txts)) {
    txt <- txts[[i]]
    if (is.na(txt)) {
      # a missing text cannot be tokenized; keep it missing instead of
      # failing on `if (NA)`
      txts_seg[i] <- NA_character_
    } else if (txt != "") {
      toks <- unlist(RMeCab::RMeCabC(txt), use.names = FALSE)
      txts_seg[i] <- stringi::stri_c(toks, collapse = " ")
    }
    # progress indicator, one line per 100 texts
    if (i %% 100 == 0) cat(i, "\n")
  }
  names(txts_seg) <- names(txts)
  txts_seg
}

# create year-by-year document–feature matrix
#
# For every year in `years` the loop
#   1. reads the lower- and upper-house committee speech tables
#      (Speech_by_Year/committee<year>_lower.rds / _upper.rds),
#   2. keeps speeches whose speaker label marks a committee member,
#   3. strips the speaker label and supplementary information from the text,
#   4. harmonizes committee names and keeps only `target.committees`,
#   5. segments the text with char.segment() and builds a quanteda dfm,
#   6. saves the dfm to dfm/dfm_<year>.rds.
# The input tables must provide the columns speaker, speakerPosition, speech,
# nameOfHouse, nameOfMeeting and session.
years <- 1959:2019

# committees kept in the output; also defines the factor level order
target.committees <- c(
  "内閣委員会", "総務委員会", "法務委員会", "外交防衛委員会",
  "財務金融委員会", "文部科学委員会", "厚生労働委員会",
  "農林水産委員会", "経済産業委員会", "国土交通委員会",
  "交通・情報通信・国土・環境委員会", "環境委員会"
)

# committee name harmonization across mergers/splits; rules are applied in
# the order listed (every name in `from` is rewritten to `to`)
committee.recode <- list(
  list(to = "総務委員会",
       from = c("地方行政委員会", "逓信委員会", "地方行政・警察委員会")),
  list(to = "外交防衛委員会",
       from = c("外務委員会", "安全保障委員会", "外交・防衛委員会")),
  list(to = "財務金融委員会",
       from = c("大蔵委員会", "財政・金融委員会", "財政金融委員会")),
  list(to = "文部科学委員会",
       from = c("文教委員会", "文教・科学委員会", "文教科学委員会", "科学技術委員会")),
  list(to = "厚生労働委員会",
       from = c("社会労働委員会", "厚生委員会", "労働委員会", "国民福祉委員会", "労働・社会政策委員会")),
  list(to = "経済産業委員会",
       from = c("商工委員会", "経済・産業委員会")),
  list(to = "国土交通委員会",
       from = c("運輸委員会", "建設委員会")),
  list(to = "交通・情報通信・国土・環境委員会",
       from = c("交通・情報通信委員会", "国土・環境委員会"))
)

# make sure the output directory exists before any expensive work is done;
# saveRDS() does not create missing directories
dir.create("dfm", showWarnings = FALSE, recursive = TRUE)

for (year in years) {
  # read text data (lower and upper house) and stack the two tables
  temp.lower <- readRDS(paste0("Speech_by_Year/committee", year, "_lower.rds"))
  temp.upper <- readRDS(paste0("Speech_by_Year/committee", year, "_upper.rds"))
  committee <- rbind(temp.lower, temp.upper)

  # extract statements from committee members
  ## drop the meeting-record header rows and speakers holding a position
  committee <- committee[committee$speaker != "会議録情報", ]
  committee <- committee[committee$speakerPosition == "", ]
  ## derive the speaker's capacity from the label at the start of the speech:
  ## take the first 20 characters, cut everything after the first whitespace,
  ## then reduce the label to its title suffix
  committee$capacity <- committee$speech %>%
    str_sub(1, 20) %>%
    str_replace_all("\\s+.+|\n", "") %>% ## remove speaker's names
    str_replace("^.+?(議員|委員|委員長|分科員|政府委員|主査|理事|座長|君(（.+）)?$)", "\\1") %>%
    str_replace("（.+）", "")
  ## labels that still start with ○ matched none of the titles above
  committee$capacity <- str_replace(committee$capacity, "^○.+", "その他")
  committee <- committee[(committee$capacity == "委員" & committee$nameOfHouse == "衆議院") |
                         (committee$capacity == "君" & committee$nameOfHouse == "参議院"), ]

  # remove speaker names from the beginning of each speech
  speaker.prefix <- committee$speech %>%
    str_sub(1, 20) %>%
    str_replace_all("\\s+.+|\n", "")
  ## the prefix is raw speech text, so match it literally (fixed) rather than
  ## as a regular expression; this avoids having to escape metacharacters
  committee$speech <- str_replace(
    committee$speech, stringr::fixed(paste0(speaker.prefix, "　 ")), ""
  )
  committee$speech <- str_replace(
    committee$speech, stringr::fixed(paste0(speaker.prefix, "　")), ""
  )

  # remove supplementary information
  # NOTE(review): str_replace() edits only the first match per speech and
  # `.+` is greedy; left unchanged to keep the replication output identical.
  # Confirm this is the intended scope.
  ## 〔○○登壇〕etc.
  committee$speech <- str_replace(committee$speech, "〔.+〕", "")
  ## （発言する者あり）etc.
  committee$speech <- str_replace(committee$speech, "（発言.+）", "")
  ## （拍手） (applause)
  committee$speech <- str_replace(committee$speech, "（拍手）", "")
  ## meeting information appended after a separator such as ――――◇――――
  committee$speech <- str_replace(committee$speech, "\\―+◇.+$", "")
  # exclude speeches that explain the purpose of motions, etc.
  committee <- subset(
    committee,
    !((stri_detect_regex(speech, "案文") & stri_detect_regex(speech, "朗読")) |
      (stri_detect_regex(speech, "法律案|起草案|決議案|規則案|規程案") & stri_detect_regex(speech, "提案の趣旨"))
    )
  )

  # recode committee names (harmonize across mergers/splits)
  committee$nameOfMeeting <- as.character(committee$nameOfMeeting)
  ## this special case must run before the general rules, which map other
  ## committees onto "総務委員会"
  committee$nameOfMeeting[
    committee$nameOfMeeting %in% c("総務委員会") &
      committee$session %in% 142:150 &
      committee$nameOfHouse == "参議院"
  ] <- "内閣委員会"
  for (rule in committee.recode) {
    committee$nameOfMeeting[
      committee$nameOfMeeting %in% rule[["from"]]
    ] <- rule[["to"]]
  }
  committee <- committee[committee$nameOfMeeting %in% target.committees, ]
  committee$nameOfMeeting <- factor(
    committee$nameOfMeeting,
    levels = target.committees
  )
  ## sanity check: number of speeches per committee and house
  print(table(committee$nameOfMeeting, committee$nameOfHouse))

  # build corpus
  ## convert full-width and ASCII digits to kanji numerals
  committee$speech <- chartr("１２３４５６７８９０", "一二三四五六七八九〇", committee$speech)
  committee$speech <- chartr("1234567890", "一二三四五六七八九〇", committee$speech)
  # NOTE(review): `texts<-` and dfm() applied to a corpus are believed to be
  # deprecated in newer quanteda releases; confirm the quanteda version used
  # for replication before upgrading.
  committee.corpus <- corpus(committee, text_field = "speech")
  ## replace each text by its space-separated MeCab tokens
  texts(committee.corpus) <- char.segment(texts(committee.corpus))

  # document-feature matrix
  ## the texts are already segmented, so split on whitespace only
  committee.dfm <- dfm(committee.corpus, tolower = FALSE,
                       remove_punct = TRUE, what = "fastestword")
  ## remove hiragana-only tokens (often particles/auxiliary verbs)
  committee.dfm <- dfm_remove(committee.dfm, "^[ぁ-ん]+$", valuetype = "regex")
  ## remove numerals (Arabic and kanji)
  committee.dfm <- dfm_remove(
    committee.dfm, "^[一二三四五六七八九十百千〇１２３４５６７８９０1234567890]+$", valuetype = "regex"
  )
  ## keep only tokens made entirely of full-width digits, full-width lowercase
  ## Latin letters, hiragana, katakana (incl. ー) or kanji; everything else
  ## (brackets, punctuation, ASCII letters, ...) is dropped
  committee.dfm <- dfm_keep(
    committee.dfm, "^[０-９ａ-ｚぁ-んァ-ヶー一-龠〇]+$", valuetype = "regex"
  )

  # save year-by-year document-feature matrix
  saveRDS(committee.dfm, file = paste0("dfm/dfm_", year, ".rds"))
  print(paste0("complete work on ", year, " data"))
}
